# Load raw inputs ----
# All four CSV paths are relative, so they resolve against the current
# working directory.
food_train <- read.csv(file = "data/food_train.csv")
food_test <- read.csv(file = "data/food_test.csv")
food_nutrients <- read.csv(file = "data/food_nutrients.csv")
nutrients <- read.csv(file = "data/nutrients.csv")
For convenience, I’ll change the names of the categories only for part A:
# Shorten the category labels of the training data.
# `levels<-` only relabels when the column is a factor. Since R 4.0.0,
# read.csv() keeps strings as character by default, in which case assigning
# levels would merely attach an attribute and leave the values untouched.
# Coerce first; an existing factor is left exactly as it is.
if (!is.factor(food_train$category)) {
  food_train$category <- factor(food_train$category)
}
# NOTE(review): the renaming is positional -- it assumes the existing levels
# (sorted order for a freshly built factor) line up one-to-one with the short
# names below. Confirm against the raw category values.
levels(food_train$category) <- c(
  "cakes", "candy", "chips", "chocolate", "cookies", "popcorn"
)
# Stacked, proportion-filled bar chart of category share per household-serving
# keyword. `data_plot_fulltext` is built elsewhere; this block reads its
# columns `n`, `household_serving_update`, `category`, `entropy` and `sum_n`.
# NOTE(review): `pct` divides by sum(n) over whatever grouping
# `data_plot_fulltext` carries in. If it arrives ungrouped, the percentage is
# a share of all rows rather than of each keyword -- confirm upstream.
fulltext_plotly <- data_plot_fulltext %>%
  mutate(pct = paste0(round(n / sum(n) * 100, 2), " %")) %>%
  ggplot(aes(
    x = reorder(household_serving_update, -entropy),
    y = n,
    fill = category,
    # Hover label; presumably consumed by ggplotly(tooltip = "text") like the
    # other plots in this script.
    text = paste(
      "Household serving:", household_serving_update,
      "\n Category:", category,
      "\n Count", n,
      "\n Percentage", pct,
      "\n Total n", sum_n,
      "\n Entropy", round(entropy, 2)
    )
  )) +
  # position = "fill" rescales every bar to the same total height, so the
  # y axis reads as a proportion (hence the percent labels below).
  geom_col(position = "fill", color = "white", size = 0.3) +
  coord_flip() +
  theme_light() +
  theme(text = element_text(size = 10)) +
  labs(
    x = "",
    y = "",
    title = "Keywords by Category",
    subtitle = "Ordered by Entropy (ascending)"
  ) +
  scale_y_continuous(labels = percent)
# Tile plot of ingredient counts by snack category.
# `n_top_ingredient()` and `ingredients_data_train` are defined elsewhere;
# the columns used here are `ingredient`, `category` and `n`.
ingredients_plotly <- n_top_ingredient(ingredients_data_train, 1000) %>%
  group_by(ingredient) %>%
  mutate(
    pct = paste0(round(n / sum(n) * 100, 2), " %"),
    sum_n = sum(n)
  ) %>%
  # Keep only ingredients whose count summed over their rows exceeds 3000.
  filter(sum_n > 3000) %>%
  ggplot(aes(
    x = category,
    y = reorder(ingredient, sum_n),
    fill = n,
    text = paste(
      "Ingredient: ", ingredient,
      "\n Category: ", category,
      "\n Count: ", n,
      "\n Pct: ", pct,
      "\n Total: ", sum_n
    )
  )) +
  scale_fill_gradient2(
    low = "#F9EBEA",
    mid = "#CD6155",
    high = "#922B21",
    midpoint = 3000
  ) +
  geom_tile() +
  labs(
    x = "",
    y = "",
    title = "Top Ingredients by Category",
    subtitle = "Color by Count of Products Contain the Ingredient"
  ) +
  theme_bw() +
  theme(axis.text = element_text(size = 8))

# Convert to an interactive plot whose hover box shows the `text` aesthetic.
ggplotly(ingredients_plotly, tooltip = "text")
# Stacked, proportion-filled bar chart of how each frequent description word
# is split across categories. `top_words()` and `entropy_fun()` are defined
# elsewhere; the columns used here are `word`, `category` and `n`.
description_plotly <- top_words(food_train, "description", by_category = TRUE) %>%
  group_by(word) %>%
  # Per word: total count, each row's share of that total, and the entropy
  # that `entropy_fun()` returns for those shares.
  mutate(
    sum_n = sum(n),
    pct = n / sum(n),
    entropy = entropy_fun(pct)
  ) %>%
  # Only after the entropy is computed, turn the share into a display string.
  mutate(pct = paste0(round(pct * 100, 2), " %")) %>%
  # Keep words whose total count exceeds 750.
  filter(sum_n > 750) %>%
  ggplot(aes(
    x = reorder(word, -entropy),
    y = n,
    fill = category,
    text = paste(
      "Description word:", word,
      "\n Category:", category,
      "\n Count", n,
      "\n Percentage", pct,
      "\n Total n", sum_n,
      "\n Entropy", round(entropy, 2)
    )
  )) +
  # position = "fill" rescales every bar to the same total height, so the
  # y axis reads as a proportion (hence the percent labels below).
  geom_col(position = "fill", color = "white", size = 0.3) +
  coord_flip() +
  theme_light() +
  theme(text = element_text(size = 10)) +
  labs(
    x = "",
    y = "",
    title = "Description Common Words by Category",
    subtitle = "Ordered by Entropy (Ascending)"
  ) +
  scale_y_continuous(labels = percent)

# Convert to an interactive plot whose hover box shows the `text` aesthetic.
ggplotly(description_plotly, tooltip = "text")
# Within each unit of measurement, divide the mean amount by that unit's
# largest mean amount, so the top value per unit becomes 1 and nutrients
# measured in different units can share one colour scale.
plotly_nutr <- nutr_data2 %>%
  group_by(unit_name) %>%
  mutate(
    normalize_mean_amount = mean_amount / max(mean_amount),
    # Fix the row order of the tiles to the order given by `order_nutrient`
    # (defined elsewhere).
    name = factor(name, levels = order_nutrient)
  ) %>%
  ggplot(aes(
    x = category,
    y = name,
    fill = normalize_mean_amount,
    text = paste(
      "Nutrient", name,
      "\n Category", category,
      "\n Average Amount", round(mean_amount, 2),
      "\n Normalizes mean", normalize_mean_amount,
      "\n Count", n
    )
  )) +
  geom_tile() +
  scale_fill_gradient(low = "#CDFFF7", high = "#007F5F", name = "") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 6.5),
    text = element_text(size = 9),
    axis.text.x = element_text(size = 8),
    legend.position = "right"
  ) +
  labs(
    title = "Normalized Average Amount of Food Nutrition by Snack's Category",
    y = "Nutrient Name",
    x = "",
    subtitle = "For each of the unit sizes - normalized average amount between 0 to 1",
    caption = "Sorted by the most common nutrient(on top), and goes down to the less common ones"
  )

# Convert to an interactive plot whose hover box shows the `text` aesthetic.
ggplotly(plotly_nutr, tooltip = "text")